setwd("C:/Users/horat/Desktop/CSIROIntership/soilCode")
library(dplyr)
#create pivot table
library(reshape)
library(data.table)
#data partition seperate trainset and testset
library (caTools)
library(caret)
#svm library due to limitation of iterations change the library
library(e1071)
library(LiblineaR)
#random forest
library(randomForest)
#ID4 Decision Tree classifier(CART)
library(rpart)
library(rpart.plot)
library(rattle)
#xgboost
library(xgboost)
#for knn classification
library(class)
#install neuralnetwork
library(neuralnet)
#adabag library
library(adabag)
#Stochastic Gradient Descent (SGD) Method Learning Function
library(gradDescent)
library(lightgbm)
#https://www.kaggle.com/c/amazon-employee-access-challenge/discussion/5128#38925
#matrix library
library(Matrix)
#catboost
library(catboost)
#fast naive bayes
library("fastNaiveBayes")
#tidyverse for easy data manipulation and visualization
#caret for easy machine learning workflow
library(tidyverse)
library(caret)
# Load the pre-built soil feature table; keep text columns as character
# (not factors) so the "NULL" sentinel strings below can be matched.
featureSoilTable <- read.csv(file = "featureTable.csv",stringsAsFactors=FALSE)
print(head(featureSoilTable))
# Min-max normalize a vector onto [0, 1].
#
# Fixes over the original:
#  - a constant vector (max == min) previously divided by zero and produced
#    NaN for every element; it now maps to all zeros.
#  - NAs no longer poison min()/max(): the range is computed with
#    na.rm = TRUE, so non-missing elements are normalized and NAs stay NA.
#
# @param x numeric (or coercible) vector.
# @return numeric vector of the same length as x, scaled to [0, 1].
normalize <- function(x){
  x <- as.numeric(x)
  rng <- range(x, na.rm = TRUE)
  if (rng[1] == rng[2]) {
    # Degenerate case: no spread, so define the normalized value as 0.
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
# Replace the "NULL" sentinel strings in the target column with real NA
featureSoilTable['h_texture'][featureSoilTable['h_texture'] == "NULL"] <- NA
# Prefix every column name with "Str_" so downstream names are syntactic
colnames(featureSoilTable) <- paste("Str",colnames(featureSoilTable),sep = "_")
print(head(featureSoilTable))
# Split rows into labelled (texture present) and unlabelled samples
validsoilTexture <- featureSoilTable[!is.na(featureSoilTable$Str_h_texture),]
invalidsoilTexture <- featureSoilTable[is.na(featureSoilTable$Str_h_texture),]
# Drop columns that are entirely NA
validsoilTexture <- validsoilTexture[,colSums(is.na(validsoilTexture))<nrow(validsoilTexture)]
# Impute remaining missing values with 0
validsoilTexture[is.na(validsoilTexture)] = 0
# Encode the texture label as an integer code (factor level index)
validsoilTexture$Str_h_texture <- as.numeric(as.factor(validsoilTexture$Str_h_texture))
# NOTE(review): apply() on a data.frame coerces it to a character MATRIX;
# from here on validsoilTexture is a matrix, not a data.frame -- confirm
# that `$` / subset() usage below behaves as intended on that object.
validsoilTexture <- apply(validsoilTexture, 2, as.factor)
validsoilTexture <- apply(validsoilTexture, 2, as.numeric)
# Min-max scale every feature column (column 1 is the label, left unscaled)
validsoilTexture[,-1]<- (apply(validsoilTexture[,-1],2,normalize))
print(head(validsoilTexture))
# Stratified 70/30 train/test split, reproducible via the fixed seed
set.seed(122)
split = sample.split(validsoilTexture$Str_h_texture,SplitRatio = 0.7)
train_set = subset(validsoilTexture, split == TRUE)
test_set = subset(validsoilTexture, split == FALSE)
# NOTE(review): `$` extraction on a matrix returns NULL in base R -- these
# two lines presumably expect a data.frame; verify they actually take effect.
train_set$Str_h_texture = as.numeric(train_set$Str_h_texture)
test_set$Str_h_texture = as.numeric(test_set$Str_h_texture)
summary(train_set)
Str_h_texture Str_samp_no Str_labr_no Str_X1.40E.02 Str_X1.40E.04 Str_X1.80E.03 Str_X10_BC
Min. : 1.00 Min. :0.00000 Min. :0.00000 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.0000000
1st Qu.:27.00 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X10A_NR Str_X10A1 Str_X10B Str_X10B_NR Str_X10B1 Str_X10B3 Str_X10D1
Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X11A1 Str_X12_HCL_CU Str_X12_HCL_FE Str_X12_HCL_MN Str_X12_HCL_ZN Str_X12_HF_CU Str_X12_HF_FE
Min. :0.0000000 Min. :0.00e+00 Min. :0.000000 Min. :0.00e+00 Min. :0.00e+00 Min. :0.000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.002107 1st Qu.:0.000000
Str_X12_HF_MN Str_X12_HF_ZN Str_X12_NR_CU Str_X12_NR_FE Str_X12_NR_MN Str_X12_NR_ZN Str_X12_XRF_CU
Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X12_XRF_FE Str_X12_XRF_MN Str_X12_XRF_ZN Str_X12A1_CU Str_X12A1_FE Str_X12A1_MN Str_X12A1_ZN
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X12B1_CU Str_X12B1_ZN Str_X12C1 Str_X12C2 Str_X13_C_FE Str_X13_NR_AL Str_X13_NR_FE
Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000e+00 Min. :0.0000000 Min. :0.00e+00
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00
Str_X13_NR_MN Str_X13A1_AL Str_X13A1_FE Str_X13A1_MN Str_X13A1_SI Str_X13B1_AL Str_X13B1_FE
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000e+00 Min. :0.000e+00
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000e+00 1st Qu.:0.000e+00
Str_X13C_C_FE Str_X13C1_AL Str_X13C1_FE Str_X13C1_FE203 Str_X13C1_MN Str_X13C1_SI Str_X14_NR_S
Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.00e+00 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X140 Str_X14B1 Str_X14C1 Str_X14D1_C Str_X14D2_BC Str_X14F1 Str_X14H1_CA
Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X14H1_K Str_X14H1_MG Str_X14H1_NA Str_X15_BASES Str_X15_HSK_CEC Str_X15_NR Str_X15_NR_AL
Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0e+00 Min. :0.000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0e+00 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X15_NR_BSa Str_X15_NR_BSP Str_X15_NR_CA Str_X15_NR_CEC Str_X15_NR_CMR Str_X15_NR_ESP Str_X15_NR_H
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X15_NR_K Str_X15_NR_MG Str_X15_NR_MN Str_X15_NR_NA Str_X15A1_CA Str_X15A1_CEC Str_X15A1_K
Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000
Str_X15A1_MG Str_X15A1_MN Str_X15A1_NA Str_X15A2_CA Str_X15A2_CEC Str_X15A2_K Str_X15A2_MG
Min. :0.0000 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000
1st Qu.:0.0000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000
Str_X15A2_NA Str_X15A3_NA Str_X15B1_CA Str_X15B1_K Str_X15B1_MG Str_X15B1_NA Str_X15B2_CA
Min. :0.000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X15B2_CEC Str_X15B2_K Str_X15B2_MG Str_X15B2_NA Str_X15C1_CA Str_X15C1_CEC Str_X15C1_K
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.00000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000
Str_X15C1_MG Str_X15C1_NA Str_X15D1_AL Str_X15D1_CA Str_X15D1_CEC Str_X15D1_K Str_X15D1_MG
Min. :0.000000 Min. :0.00000 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X15D1_NA Str_X15D2_CA Str_X15D2_CEC Str_X15D2_K Str_X15D2_MG Str_X15D2_NA Str_X15E1_AL
Min. :0e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_X15E1_CA Str_X15E1_CEC Str_X15E1_H Str_X15E1_K Str_X15E1_MG Str_X15E1_MN Str_X15E1_NA
Min. :0.000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X15E2_CA Str_X15E2_K Str_X15E2_MG Str_X15E2_NA Str_X15F1_CA Str_X15F1_CEC Str_X15F1_K Str_X15F1_MG
Min. :0 Min. :0 Min. :0 Min. :0 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_X15F1_NA Str_X15F2 Str_X15F2_AL Str_X15F3 Str_X15F4 Str_X15G_C Str_X15G_C_AL1
Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_X15G_C_AL2 Str_X15G_C_H1 Str_X15G_H Str_X15G1 Str_X15G1_AL Str_X15G1_H Str_X15I3
Min. :0.0000000 Min. :0.000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X15I4 Str_X15J_BASES Str_X15J_C Str_X15J_H Str_X15J1 Str_X15L1 Str_X15L1_a
Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000
Str_X15N1 Str_X15N1_a Str_X15N1_b Str_X17A_HF. Str_X17A_NR Str_X17A1 Str_X18_NR
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000e+00 Min. :0.000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000e+00 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X18_NR_K Str_X18A1 Str_X18A1_NR Str_X18B1 Str_X18B2 Str_X18F1_AL Str_X18F1_AS
Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_X18F1_B Str_X18F1_CA Str_X18F1_CD Str_X18F1_CO Str_X18F1_CU Str_X18F1_FE Str_X18F1_K
Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_X18F1_MG Str_X18F1_MN Str_X18F1_MO Str_X18F1_NA Str_X18F1_NI Str_X18F1_P Str_X18F1_PB
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_X18F1_S Str_X18F1_SE Str_X18F1_ZN Str_X19_COL Str_X19A1 Str_X19B_NR Str_X19B1
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0e+00 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0e+00 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_X2.00E.01 Str_X2.00E.02 Str_X2_LOI Str_X2A1 Str_X2D1 Str_X2Z1_R1 Str_X2Z1_R2
Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.000000
1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000
Str_X2Z2_C Str_X2Z2_CLAY Str_X2Z2_CS Str_X2Z2_FS Str_X2Z2_S Str_X2Z2_Z Str_X3_C_B
Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.000000
1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000
Str_X3_NR Str_X3A_C_2.5 Str_X3A_TSS Str_X3A1 Str_X4_NR Str_X4A_C_1 Str_X4A_C_2.5
Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.00000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
Str_X4A1 Str_X4B_AL Str_X4B_AL_NR Str_X4B_C_2.5 Str_X4B1 Str_X4B2 Str_X4C_C_1
Min. :0.00000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000 Min. :0.0000 Min. :0.000000
1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000
Str_X4C1 Str_X4G_NR Str_X5_C_B Str_X5_NR Str_X5A_C_2.5 Str_X5A_NR Str_X5A1
Min. :0.000000 Min. :0.0e+00 Min. :0.00000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.0e+00 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X5A2 Str_X6_DC Str_X6A1 Str_X6A1_UC Str_X6B1 Str_X6B2 Str_X6B3
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.00000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000
Str_X6Z Str_X7_C_B Str_X7_NR Str_X7A1 Str_X7A2 Str_X7A2a Str_X7A5
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_X7B1 Str_X7C_CASO4 Str_X7C1 Str_X7C1a Str_X7C1b Str_X7C1d Str_X7C1e
Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X8A1 Str_X9.00E.02 Str_X9_E_NR Str_X9_NR Str_X9A_HCL Str_X9A_HCLP2O5 Str_X9A_HF.
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00
Str_X9A_NR Str_X9A_S14 Str_X9A1 Str_X9A3 Str_X9A3a Str_X9B_9C Str_X9B_NR
Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_X9B1 Str_X9B2 Str_X9B2_COL Str_X9BUFF_0 Str_X9BUFF_0.5 Str_X9BUFF_1 Str_X9BUFF_2
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X9BUFF_4 Str_X9C2 Str_X9D2 Str_X9E Str_X9G_BSES Str_X9G1 Str_X9G2
Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_X9H_NR Str_X9H1 Str_X9I1 Str_X9J2 Str_X9R1 Str_M1a Str_MIN_EC Str_MIN_NR_K2O
Min. :0 Min. :0.000000 Min. :0.001248 Min. :0.000000 Min. :0.0000000 Min. :0 Min. :0.000000 Min. :0.000000
1st Qu.:0 1st Qu.:0.001597 1st Qu.:0.001248 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0 1st Qu.:0.000000 1st Qu.:0.000000
Str_P10_1m2m Str_P10_20_100 Str_P10_20_75 Str_P10_20_75a Str_P10_75_106 Str_P10_CF_C Str_P10_CF_CS
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.00000 Min. :0.00000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000
Str_P10_CF_FS Str_P10_CF_S Str_P10_CF_Z Str_P10_GRAV Str_P10_gt2m Str_P10_gt2MI Str_P10_gt2OM
Min. :0.0000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00e+00 Min. :0.00e+00
1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00
Str_P10_HYD_C Str_P10_HYD_CS Str_P10_HYD_FS Str_P10_HYD_Z Str_P10_NR_C Str_P10_NR_CS Str_P10_NR_FS
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Str_P10_NR_S Str_P10_NR_Saa Str_P10_NR_Z Str_P10_NR_ZC Str_P10_PB_C Str_P10_PB_CS Str_P10_PB_FS
Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00e+00 Min. :0.00000 Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00e+00 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
Str_P10_PB_S Str_P10_PB_Z Str_P10_PB1_C Str_P10_PB1_CS Str_P10_PB1_FS Str_P10_PB1_Z Str_P10_S_0.20
Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_P10_S_0.48 Str_P10_S_1 Str_P10_S_1000 Str_P10_S_125 Str_P10_S_15.6 Str_P10_S_2 Str_P10_S_20
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P10_S_2000 Str_P10_S_250 Str_P10_S_3.9 Str_P10_S_31.2 Str_P10_S_500 Str_P10_S_53 Str_P10_S_63
Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.00000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000
Str_P10_S_7.8 Str_P10100_200 Str_P10106_150 Str_P10150_180 Str_P10180_300 Str_P10200_500 Str_P10200_600
Min. :0.000000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_P102002000 Str_P10300_600 Str_P105002000 Str_P106001000 Str_P106002000 Str_P10A1_C Str_P10A1_CS
Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P10A1_FS Str_P10A1_Z Str_P3A_NR Str_P3A1 Str_P3A1_C4 Str_P3A1_CLOD Str_P3A1_e
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.00e+00
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00e+00
Str_P3A2_McK Str_P3A2_McKMP Str_P3B_GV_01 Str_P3B_GV_03 Str_P3B_GV_15 Str_P3B_NR_005 Str_P3B_NR_01
Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000
Str_P3B_NR_15 Str_P3B_VL_01 Str_P3B_VL_15 Str_P3B1GV_15 Str_P3B1VL_1 Str_P3B1VL_15 Str_P3B2GV_1
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00
Str_P3B2GV_15 Str_P3B2GV_5 Str_P3B2VL_03 Str_P3B2VL_1 Str_P3B2VL_15 Str_P3B2VL_5 Str_P3B3VLa001
Min. :0.0e+00 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.0e+00 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P3B3VLa005 Str_P3B3VLa01 Str_P3B3VLa03 Str_P3B3VLa06 Str_P3B3VLaSAT Str_P3B3VLb001 Str_P3B3VLb003
Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.00e+00 Min. :0.00000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P3B3VLb005 Str_P3B3VLb01 Str_P3B3VLb03 Str_P3B3VLb05 Str_P3B3VLb06 Str_P3B3VLbSAT Str_P3B3VLc001
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.000000 Min. :0.000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P3B3VLc003 Str_P3B3VLc005 Str_P3B3VLc01 Str_P3B3VLc03 Str_P3B3VLc06 Str_P3B3VLcSAT Str_P3B3VLd06
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000
Str_P3B3VLd1 Str_P3B3VLd15 Str_P3B3VLd3 Str_P3B3VLd5 Str_P3B3VLe004 Str_P3B3VLe01 Str_P3B3VLe03
Min. :0.0000000 Min. :0.00000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_P3B3VLe06 Str_P3B3VLe15 Str_P3B3VLe2 Str_P3B3VLe7 Str_P3B4GV_01 Str_P3B4VL_005 Str_P3B5GV_01
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
Str_P4_100DMcK Str_P4_10DMcK Str_P4_30_LOV Str_P4_30DMcK Str_P4_50_McK Str_P4_50DMcK Str_P4_sat
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00
Str_P4_sat_FH Str_P4_sat_For Str_P4_sat_LOV Str_P4_sat_McK Str_P5_COLE Str_P5_LS_MOD Str_P6_LP
Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000 Min. :0.00000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.000000
Str_PWS1.2mm Str_PWS20.63 Str_PWS212.425 Str_PWS425.1mm Str_PWS63.212 Str_TE_NR_AL Str_TE_NR_AL2O
Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_TE_NR_CA Str_TE_NR_FE20 Str_TE_NR_MG Str_TE_NR_NA Str_TE_NR_SI02 Str_TE_NR_TI02 Str_TE_XRF_MG
Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.00e+00 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_TE_XRFAL Str_TE_XRFCA Str_TE_XRFNA Str_TE_XRFSI02 Str_TE_XRFTIO2 Str_XRD_C_Amp Str_XRD_C_An
Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.000000 Min. :0.00e+00 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.0000000
Str_XRD_C_Bhm Str_XRD_C_Bt Str_XRD_C_Cal Str_XRD_C_Ch2 Str_XRD_C_Chl Str_XRD_C_Fsp Str_XRD_C_Gbs
Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.00e+00 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.0000000
Str_XRD_C_Gth Str_XRD_C_Hem Str_XRD_C_Ht0 Str_XRD_C_Ilt Str_XRD_C_Is Str_XRD_C_K2O Str_XRD_C_Ka
Min. :0.0000000 Min. :0.000000 Min. :0.0000000 Min. :0.000000 Min. :0.0000 Min. :0.0000000 Min. :0.00000
1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.0000000 1st Qu.:0.00000
Str_XRD_C_Kln Str_XRD_C_Lp Str_XRD_C_Mag Str_XRD_C_Mca Str_XRD_C_Mgh Str_XRD_C_Mnt Str_XRD_C_Ms
Min. :0.0000000 Min. :0.00e+00 Min. :0.00e+00 Min. :0.00e+00 Min. :0.0000000 Min. :0.0000000 Min. :0.0000000
1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
Str_XRD_C_Plg Str_XRD_C_Plm Str_XRD_C_Qz Str_XRD_C_Rt Str_XRD_C_Sme Str_XRD_C_Tc Str_XRD_C_Vrm
Min. :0.00e+00 Min. :0.00e+00 Min. :0.000000 Min. :0.00e+00 Min. :0.0000000 Min. :0.00e+00 Min. :0.0000000
1st Qu.:0.00e+00 1st Qu.:0.00e+00 1st Qu.:0.000000 1st Qu.:0.00e+00 1st Qu.:0.0000000 1st Qu.:0.00e+00 1st Qu.:0.0000000
[ reached getOption("max.print") -- omitted 4 rows ]
# Find the best model with the best cost parameter via 10-fold cross-validations
# Grid-search over LiblineaR solver types 0-7 and a coarse cost grid.
# Fixes over the original:
#  - `type=7` was hard-coded, silently ignoring the `ty` loop variable, so
#    all eight iterations fitted the same solver.
#  - `cross=10` was missing: without it LiblineaR returns a fitted model
#    object, so `acc > bestAcc` compared a model against a number.  With
#    cross=10 it returns the 10-fold cross-validation accuracy (a numeric).
tryTypes=c(0:7)
tryCosts=c(1000,1,0.001)
bestCost=NA
bestAcc=0.6290723
bestType=NA
for(ty in tryTypes){
  for(co in tryCosts){
    acc=LiblineaR(data=train_set[,-1],target=train_set[,c("Str_h_texture")],type=ty,cost=co,bias=1,cross=10,verbose=FALSE)
    cat("Results for C=",co," : ",acc," accuracy.\n",sep="")
    if(acc>bestAcc){
      bestCost=co
      bestAcc=acc
      bestType=ty
    }
  }
}
# LIBLINEAR is a linear classifier for data with millions of instances and features. It supports L2-regularized classifiers, L2-loss linear SVM, L1-loss linear SVM, and logistic regression (LR). LiblineaR allows the estimation of predictive linear models for classification and regression, such as L1- or L2-regularized logistic regression, L1- or L2-regularized L2-loss support vector classification, L2-regularized L1-loss support vector classification and multi-class support vector classification. It also supports L2-regularized support vector regression (with L1- or L2-loss). The estimation of the models is particularly fast as compared to other libraries.
# Fit the final linear SVM (cost = 1000, with bias term) on the training
# features and build the train-set confusion table: predicted class vs the
# true texture code.
svmStarttime <- Sys.time()
svm_features <- train_set[, -1]
svm_target <- train_set[, c("Str_h_texture")]
svmClassifier <- LiblineaR(data = svm_features, target = svm_target, bias = 1, cost = 1000)
svmPredictTrain <- predict(svmClassifier, svm_features, proba = TRUE, decisionValues = TRUE)
svmPredictTrainTable <- table(svmPredictTrain$predictions, svm_target)
# Sum the diagonal cells a[i, i] of a confusion table/matrix `a`, taking
# each label i from `c` and counting it only when it also appears in `r`.
# In this script `c`/`r` are the column/row label sets, so the result is
# the number of correctly classified samples even when the two label sets
# differ (some classes absent from predictions or from the truth).
#
# @param a matrix or table with character dimnames.
# @param c vector of candidate labels (typically colnames(a)).
# @param r vector of labels that must also be present (typically rownames(a)).
# @return the scalar sum of the matching diagonal entries.
sumElementinTable <- function(a,c,r){
  shared <- c[c %in% r]
  total <- 0
  for (label in shared){
    total <- total + a[label,label]
  }
  return(total)
}
# Score the SVM on both splits: accuracy = sum of diagonal (correctly
# classified) cells divided by the table total.
# Fixes over the original:
#  - the test-set predictions and svmPredictTestTable were never computed,
#    so the first two lines referenced an undefined object;
#  - svmTimeTaken was printed but never assigned.
svmEndtime <- Sys.time()
svmTimeTaken <- svmEndtime - svmStarttime
svmPredictTest <- predict(svmClassifier,test_set[,-1],proba=TRUE,decisionValues=TRUE)
svmPredictTestTable <- table(svmPredictTest$predictions,test_set[,c("Str_h_texture")])
svmTestcol <- colnames(svmPredictTestTable)
svmTestrow <- rownames(svmPredictTestTable)
svmTraincol <- colnames(svmPredictTrainTable)
svmTrainrow <- rownames(svmPredictTrainTable)
svmPredictTestScore <- sumElementinTable(svmPredictTestTable,svmTestcol,svmTestrow)/sum(svmPredictTestTable)
svmPredictTrainScore <- sumElementinTable(svmPredictTrainTable,svmTraincol,svmTrainrow)/sum(svmPredictTrainTable)
# the time of svm is:
cat("the running time of svm is",svmTimeTaken)
# (console output) the running time of svm is 50.21468
#the score of svm is
cat("the train score of svm algorithm is ",svmPredictTrainScore,'\n')
# (console output) the train score of svm algorithm is 0.3311756
cat("the test score of svm algorithm is ",svmPredictTestScore)
# (console output) the test score of svm algorithm is 0.3023567
# CART decision tree: grow a deep tree with a tiny cp, inspect the
# complexity-parameter table, then prune at the cp with the lowest
# cross-validated error and predict texture codes for the test set.
cartFit <- rpart(Str_h_texture ~ .,data = train_set,control = rpart.control(cp = 0.0001))
#get cp value
printcp(cartFit)
# choose the CP with lowest xerror (value below read off the printcp table)
cartstartTime <- Sys.time()
fit.pruned = prune(cartFit, cp = 0.00020393)
cartPrediction <- predict(fit.pruned, test_set, type = "vector")
cartendTime <- Sys.time()
# Fix: the original computed `cartendTime - cartendTime`, which is always
# zero, so the reported CART runtime was meaningless.
cartTimeTaken <- cartendTime - cartstartTime
# NOTE(review): this data.frame is built and immediately discarded --
# presumably meant for inspection; confirm it is intentional.
data.frame(test_set,cartPrediction)
# Round the regression-style prediction back to an integer class code
cartPrediction = round(cartPrediction,0)
cartTable <- table(test_set$Str_h_texture,cartPrediction)
cartTable
cartPrediction
5 6 8 9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 2 1 0 0 0 0 0 1 0 0 1 3 2 0 1 1 3 0 0 0 1 1
4 0 0 0 0 0 0 0 0 0 0 4 0 2 2 0 0 2 3 0 0 12 0 0 0 2 4 1
5 0 1 4 0 6 50 4 5 22 23 18 13 3 16 2 18 12 18 0 11 24 15 8 52 1 21 30
6 0 0 0 0 0 0 1 1 3 10 0 0 0 0 0 0 11 8 0 4 0 0 3 8 0 0 4
7 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 2 0 0 1 2 0
8 0 0 0 0 0 0 1 0 1 10 4 0 0 1 0 3 3 0 0 0 3 3 0 2 0 8 4
9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 0 0 0 1 0 5 0 1 3 0 9 8 4 3 0 0 5 10 5 1 14 4 3 1 0 16 8
12 0 0 0 0 0 3 0 0 0 0 2 0 2 0 0 1 7 3 0 2 12 3 1 1 1 15 3
13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14 0 0 0 1 0 2 0 0 10 13 3 0 2 6 1 3 30 4 0 3 8 6 3 2 1 6 14
15 0 0 0 0 0 10 1 0 0 6 5 1 5 8 1 2 52 22 1 19 12 5 4 19 4 8 11
16 0 0 0 0 0 0 0 0 0 4 0 0 0 1 0 0 0 1 0 0 3 0 3 3 0 1 5
17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 6 0 0 0 0 0 7 2
19 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
cartPrediction
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 61 64
1 0 0 0 1 4 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 3 3 0 27 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 4 3 0 49 9 5 2 1 4 0 1 0 0 0 0 1 1 2 0 0 0 0 0 0 0
5 26 31 12 64 452 38 19 21 31 6 12 2 5 5 3 17 4 3 0 4 0 5 3 0 1 1
6 0 3 2 6 59 10 5 4 3 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0
7 0 0 1 0 7 0 4 1 1 0 0 0 0 1 0 4 0 1 0 0 0 0 0 0 0 0
8 2 7 6 3 124 12 7 1 3 2 2 3 1 2 2 4 7 3 2 0 0 2 0 4 0 0
9 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 1 5 15 14 233 19 21 5 18 10 9 17 5 10 5 6 8 3 2 0 0 3 0 0 0 0
12 0 8 4 4 102 2 3 1 2 8 1 1 0 7 12 2 1 0 0 0 0 0 0 0 0 0
13 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14 3 5 2 6 116 12 4 4 3 2 6 0 1 0 0 0 0 0 2 1 1 0 0 0 0 0
15 3 15 4 4 178 21 10 7 7 4 0 2 3 3 1 3 0 3 0 1 0 0 0 0 0 0
16 0 2 0 3 36 4 0 11 0 0 3 0 0 1 2 0 0 0 1 0 0 0 1 0 0 0
17 0 0 0 0 9 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
18 0 0 0 13 29 6 3 3 2 0 0 1 3 0 1 3 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[ reached getOption("max.print") -- omitted 45 rows ]
# calculate the score of cart model (accuracy from the confusion table)
# Fix: several bare prose lines in this section were not commented and
# would abort the script with a syntax error; they are now comments.
cartrow <- rownames(cartTable)
cartcol <- colnames(cartTable)
cartscore <- sumElementinTable(cartTable,cartrow,cartcol)/sum(cartTable)
# the time of cart model
cat("the time of cart",cartTimeTaken)
# (console output) the time of cart 0
# the score of cart model
cat('the score of cart model',cartscore)
# (console output) the score of cart model 0.02074463
# separate x and y from train_set and test_set
train_set.num_X <- select (train_set,-c(Str_h_texture))
test_set.num_X <- select (test_set,-c(Str_h_texture))
# start lightgbm machine learning algorithms: 5-fold CV, 10 boosting rounds
lstarttime <- Sys.time()
ltrain = lgb.Dataset(data = as.matrix(train_set.num_X),label = train_set$Str_h_texture, free_raw_data = FALSE)
# NOTE(review): objective="regression" treats the integer texture code as a
# continuous target; a multiclass objective may be more appropriate -- confirm.
params <- list(objective="regression", metric="l2")
model <- lgb.cv(params,
                ltrain ,
                10,
                nfold=5,
                min_data=1,
                learning_rate=1,
                early_stopping_rounds=10,
                # Fix: the original passed `Depth = 8`; LightGBM logged
                # "Unknown parameter: Depth" and silently ignored it, so the
                # intended depth limit never applied.  The correct name is
                # max_depth.
                max_depth = 8,
                lambda_l1 = 10,
                lambda_l2 = 10
)
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37693, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37693, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Info] Start training from score 37.875524
[LightGBM] [Info] Start training from score 37.803815
[LightGBM] [Info] Start training from score 37.814639
[LightGBM] [Info] Start training from score 37.763510
[LightGBM] [Info] Start training from score 37.848672
[1]: valid's l2:255.554+1.93548
[2]: valid's l2:252.462+2.45908
[3]: valid's l2:251.348+2.5486
[4]: valid's l2:250.173+2.01428
[5]: valid's l2:248.854+2.47545
[6]: valid's l2:248.224+2.60872
[7]: valid's l2:248.128+2.67808
[8]: valid's l2:247.48+2.80606
[9]: valid's l2:247.181+3.28257
[10]: valid's l2:247.176+3.60095
# Record the LightGBM run's end timestamp (paired with lstarttime, set earlier,
# to compute the elapsed wall-clock time below).
lstoptime <- Sys.time()
num_leaves: This is the main parameter to control the complexity of the tree model. Theoretically, we can set num_leaves = 2^(max_depth) to obtain the same number of leaves as depth-wise tree. However, this simple conversion is not good in practice. The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting. Thus, when trying to tune the num_leaves, we should let it be smaller than 2^(max_depth). For example, when the max_depth=7 the depth-wise tree can get good accuracy, but setting num_leaves to 127 may cause over-fitting, and setting it to 70 or 80 may get better accuracy than depth-wise.
min_data_in_leaf: This is a very important parameter to prevent over-fitting in a leaf-wise tree. Its optimal value depends on the number of training samples and num_leaves. Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset.
max_depth: You also can use max_depth to limit the tree depth explicitly.
# Build a validation Dataset that shares ltrain's bin mappings, then grid-search
# LightGBM's tree depth and L1/L2 regularisation strengths.
ltest = lgb.Dataset.create.valid(ltrain , as.matrix(test_set.num_X), label = test_set$Str_h_texture)
valids <- list(test = ltest)
# Candidate hyper-parameter combinations: 8 depths x 9 L1 x 9 L2 = 648 models.
grid_search <- expand.grid(Depth = 1:8,
                           L1 = 8:16,
                           L2 = 8:16)
model <- list()
perf <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  model[[i]] <- lgb.train(list(objective = "regression",
                               metric = "l2",
                               lambda_l1 = grid_search[i, "L1"],
                               lambda_l2 = grid_search[i, "L2"],
                               max_depth = grid_search[i, "Depth"]),
                          ltrain,
                          2,
                          valids,
                          min_data = 1,
                          learning_rate = 1,
                          early_stopping_rounds = 5,
                          num_leaves = 2,
                          num_iterations = 1000,
                          min_gain_to_split = 500)
  # BUG FIX: the original call ended "min_gain_to_split = 500,)" - a trailing
  # comma creates an empty argument, which is an error in an R function call.
  # NOTE(review): record_evals$test$l2 holds both the eval values and their
  # stderr; min() over the rbindlist of both is kept as-is - verify intent.
  perf[i] <- min(rbindlist(model[[i]]$record_evals$test$l2))
}
cat("Model ", which.min(perf), " is lowest loss: ", min(perf), sep = "")
print(grid_search[which.min(perf), ])
The algorithm's score is around 0.38 and its computation time is:
# Elapsed LightGBM wall-clock time, forced to seconds.
# BUG FIX: "lstoptime - lstarttime" returns a difftime whose units are chosen
# automatically (secs/mins/hours), so the hard-coded "seconds" label could be
# wrong; difftime(units = "secs") makes the label always correct.
lgbtaketime <- as.numeric(difftime(lstoptime, lstarttime, units = "secs"))
cat("The algorithms takes ", lgbtaketime, "seconds")
The algorithms takes 2.367668 seconds
# Time the CatBoost run.
catstartTime <- Sys.time()
# CatBoost training configuration.
fit_params <- list(
  l2_leaf_reg   = 0.001,
  depth         = 6,
  learning_rate = 0.1,
  iterations    = 100,
  random_seed   = 233
)
# Wrap the numeric feature matrix and integer-coded labels (column 1) in a pool.
pool <- catboost.load_pool(as.matrix(train_set.num_X), label = as.integer(train_set[, 1]))
# Fit the model.
model <- catboost.train(pool, params = fit_params)
0: learn: 16.5371834 total: 32.2ms remaining: 3.19s
1: learn: 16.4819241 total: 69.2ms remaining: 3.39s
2: learn: 16.4321274 total: 106ms remaining: 3.42s
3: learn: 16.3910531 total: 138ms remaining: 3.32s
4: learn: 16.3533938 total: 168ms remaining: 3.19s
5: learn: 16.3204430 total: 203ms remaining: 3.18s
6: learn: 16.2939862 total: 234ms remaining: 3.11s
7: learn: 16.2683750 total: 271ms remaining: 3.11s
8: learn: 16.2458727 total: 302ms remaining: 3.05s
9: learn: 16.2231423 total: 341ms remaining: 3.07s
10: learn: 16.2049919 total: 375ms remaining: 3.03s
11: learn: 16.1861848 total: 409ms remaining: 3s
12: learn: 16.1700804 total: 443ms remaining: 2.97s
13: learn: 16.1504023 total: 482ms remaining: 2.96s
14: learn: 16.1353299 total: 518ms remaining: 2.94s
15: learn: 16.1186926 total: 563ms remaining: 2.96s
16: learn: 16.1051210 total: 602ms remaining: 2.94s
17: learn: 16.0876521 total: 638ms remaining: 2.9s
18: learn: 16.0711332 total: 668ms remaining: 2.85s
19: learn: 16.0593029 total: 700ms remaining: 2.8s
20: learn: 16.0460410 total: 736ms remaining: 2.77s
21: learn: 16.0358604 total: 774ms remaining: 2.74s
22: learn: 16.0280227 total: 816ms remaining: 2.73s
23: learn: 16.0157761 total: 855ms remaining: 2.71s
24: learn: 16.0087857 total: 887ms remaining: 2.66s
25: learn: 15.9957831 total: 920ms remaining: 2.62s
26: learn: 15.9863858 total: 957ms remaining: 2.59s
27: learn: 15.9777166 total: 987ms remaining: 2.54s
28: learn: 15.9688117 total: 1.02s remaining: 2.51s
29: learn: 15.9601138 total: 1.06s remaining: 2.47s
30: learn: 15.9527663 total: 1.1s remaining: 2.44s
31: learn: 15.9453010 total: 1.13s remaining: 2.4s
32: learn: 15.9372778 total: 1.16s remaining: 2.36s
33: learn: 15.9286856 total: 1.19s remaining: 2.31s
34: learn: 15.9228070 total: 1.22s remaining: 2.27s
35: learn: 15.9157059 total: 1.26s remaining: 2.23s
36: learn: 15.9106635 total: 1.29s remaining: 2.21s
37: learn: 15.9037768 total: 1.33s remaining: 2.17s
38: learn: 15.8939276 total: 1.37s remaining: 2.14s
39: learn: 15.8890469 total: 1.41s remaining: 2.11s
40: learn: 15.8833173 total: 1.44s remaining: 2.08s
41: learn: 15.8744792 total: 1.48s remaining: 2.04s
42: learn: 15.8702257 total: 1.51s remaining: 2s
43: learn: 15.8647871 total: 1.54s remaining: 1.96s
44: learn: 15.8575401 total: 1.57s remaining: 1.93s
45: learn: 15.8539751 total: 1.61s remaining: 1.89s
46: learn: 15.8486778 total: 1.64s remaining: 1.85s
47: learn: 15.8444692 total: 1.68s remaining: 1.81s
48: learn: 15.8384841 total: 1.71s remaining: 1.78s
49: learn: 15.8318644 total: 1.75s remaining: 1.75s
50: learn: 15.8211050 total: 1.78s remaining: 1.71s
51: learn: 15.8171272 total: 1.81s remaining: 1.67s
52: learn: 15.8131695 total: 1.84s remaining: 1.63s
53: learn: 15.8069144 total: 1.88s remaining: 1.6s
54: learn: 15.8029788 total: 1.91s remaining: 1.56s
55: learn: 15.7989711 total: 1.94s remaining: 1.53s
56: learn: 15.7945593 total: 1.98s remaining: 1.49s
57: learn: 15.7905434 total: 2.01s remaining: 1.45s
58: learn: 15.7863749 total: 2.04s remaining: 1.42s
59: learn: 15.7833903 total: 2.08s remaining: 1.39s
60: learn: 15.7762922 total: 2.12s remaining: 1.35s
61: learn: 15.7700376 total: 2.15s remaining: 1.32s
62: learn: 15.7651458 total: 2.19s remaining: 1.28s
63: learn: 15.7622770 total: 2.22s remaining: 1.25s
64: learn: 15.7581282 total: 2.25s remaining: 1.21s
65: learn: 15.7534077 total: 2.28s remaining: 1.18s
66: learn: 15.7509048 total: 2.32s remaining: 1.14s
67: learn: 15.7476133 total: 2.36s remaining: 1.11s
68: learn: 15.7446307 total: 2.39s remaining: 1.07s
69: learn: 15.7414847 total: 2.43s remaining: 1.04s
70: learn: 15.7370984 total: 2.46s remaining: 1s
71: learn: 15.7333530 total: 2.49s remaining: 969ms
72: learn: 15.7303575 total: 2.53s remaining: 935ms
73: learn: 15.7274914 total: 2.56s remaining: 898ms
74: learn: 15.7233322 total: 2.59s remaining: 864ms
75: learn: 15.7191933 total: 2.63s remaining: 829ms
76: learn: 15.7132419 total: 2.66s remaining: 795ms
77: learn: 15.7114110 total: 2.69s remaining: 760ms
78: learn: 15.7090804 total: 2.73s remaining: 726ms
79: learn: 15.7052953 total: 2.77s remaining: 692ms
80: learn: 15.7020065 total: 2.8s remaining: 657ms
81: learn: 15.6988539 total: 2.83s remaining: 622ms
82: learn: 15.6959701 total: 2.87s remaining: 588ms
83: learn: 15.6938343 total: 2.91s remaining: 554ms
84: learn: 15.6901454 total: 2.94s remaining: 520ms
85: learn: 15.6821179 total: 2.99s remaining: 486ms
86: learn: 15.6784857 total: 3.03s remaining: 452ms
87: learn: 15.6749618 total: 3.06s remaining: 418ms
88: learn: 15.6674241 total: 3.09s remaining: 382ms
89: learn: 15.6639432 total: 3.13s remaining: 348ms
90: learn: 15.6576915 total: 3.17s remaining: 314ms
91: learn: 15.6549104 total: 3.21s remaining: 279ms
92: learn: 15.6483326 total: 3.24s remaining: 244ms
93: learn: 15.6451033 total: 3.28s remaining: 209ms
94: learn: 15.6419415 total: 3.31s remaining: 174ms
95: learn: 15.6385591 total: 3.35s remaining: 140ms
96: learn: 15.6362077 total: 3.39s remaining: 105ms
97: learn: 15.6324720 total: 3.42s remaining: 69.8ms
98: learn: 15.6281495 total: 3.45s remaining: 34.9ms
99: learn: 15.6249271 total: 3.49s remaining: 0us
catstopTime <- Sys.time()
# Elapsed CatBoost time in seconds.
# BUG FIX: explicit units - plain subtraction picks difftime units automatically,
# which can silently disagree with the "seconds" label printed later.
cattakenTime <- as.numeric(difftime(catstopTime, catstartTime, units = "secs"))
Calculate the prediction:
# Score the training pool with the fitted model; RawFormulaVal returns the
# model's raw (unrounded) output.
catprediction <- catboost.predict(model, pool,
                                  prediction_type = 'RawFormulaVal')
Calculate the program score:
# Round the raw predictions to the nearest integer so they can be tabulated
# against the integer-coded texture classes.
catprediction <- round(catprediction)
# Confusion table: actual class codes (rows) vs rounded predictions (columns).
catTable <- table(train_set$Str_h_texture, catprediction)
catTablerow <- rownames(catTable)
catTablecol <- colnames(catTable)
# Accuracy = sum of matching-label cells over all observations.
catscore <- sumElementinTable(catTable, catTablerow, catTablecol) / sum(catTable)
cat('The algorithm takes', cattakenTime, 'seconds')
The algorithm takes 4.16187 seconds
# Report CatBoost's training-set accuracy.
# NOTE(review): the recorded score (~0.02) is near chance - rounding a raw
# regression output to match factor codes is a weak classifier; verify approach.
cat('The algorithm scores' ,catscore)
The algorithm scores 0.02073562
nbstarttime <- Sys.time()
# Train a Laplace-smoothed naive Bayes classifier on the texture classes.
nbClassifier <- naiveBayes(as.factor(Str_h_texture) ~ ., data = train_set, laplace = 2)
# Test-set accuracy.
nbTestPrediction <- predict(nbClassifier, test_set, type = "class")
nbTableTest <- table(nbTestPrediction, test_set$Str_h_texture)
nbTestTablerow <- rownames(nbTableTest)
nbTestTablecol <- colnames(nbTableTest)
nbTestTablescore <- sumElementinTable(nbTableTest, nbTestTablerow, nbTestTablecol) / sum(nbTableTest)
nbendtime <- Sys.time()
# Training-set accuracy (computed outside the timed section above).
nbTrainPrediction <- predict(nbClassifier, train_set, type = "class")
nbTrainTable <- table(nbTrainPrediction, train_set$Str_h_texture)
nbTrainTablerow <- rownames(nbTrainTable)
nbTrainTablecol <- colnames(nbTrainTable)
nbTrainTablescore <- sumElementinTable(nbTrainTable, nbTrainTablerow, nbTrainTablecol) / sum(nbTrainTable)
# BUG FIX: explicit units = "secs" - plain subtraction picks difftime units
# automatically, which can disagree with the printed "seconds" label.
nbtakentime <- as.numeric(difftime(nbendtime, nbstarttime, units = "secs"))
cat('NaiveBayes takes', nbtakentime, 'seconds')
NaiveBayes takes 4.305502 seconds
# Report the naive Bayes TRAIN-set score.
# NOTE(review): the recorded output (355) is greater than 1, yet this value is
# a sum divided by sum(table) - verify sumElementinTable returns what the
# division assumes (a matched count, not an already-scaled score).
cat('NaiveBayes score',nbTrainTablescore)
NaiveBayes score 355
fnbstartTime <- Sys.time()
# Detect a suitable event model per feature, then fit a Gaussian naive Bayes
# on the columns fastNaiveBayes flags as Gaussian-distributed.
dist <- fnb.detect_distribution(train_set.num_X)
gauss <- fnb.gaussian(train_set.num_X[, dist$gaussian], as.factor(train_set$Str_h_texture), sparse = TRUE, check = FALSE)
pred <- predict(gauss, train_set.num_X[, dist$gaussian])
fnbendTime <- Sys.time()
# Training-set error rate.
error <- mean(as.factor(train_set$Str_h_texture) != pred)
print(error)
# Elapsed time in seconds (explicit units so the label below is accurate).
fnbtakentime <- as.numeric(difftime(fnbendTime, fnbstartTime, units = "secs"))
# BUG FIX: print() takes a single object and errors/ignores extra parts here;
# cat() is the correct function for a multi-part message.
cat("fastNaiveBayes takes ", fnbtakentime, "seconds")
We can use neuralnet() to train a NN model. Also, the train() function from caret can help us tune parameters. We can plot the result to see which set of parameters fits our data best.
Tuning the parameters:
# Tune the network architecture with caret: layer sizes are searched over a
# grid while the optimisation settings stay fixed.
Model <- train(Str_h_texture ~ .,
               data = train_set,
               method = "neuralnet",
               ### layer-size grid (third layer disabled)
               tuneGrid = expand.grid(.layer1 = c(1:2), .layer2 = c(0:2), .layer3 = c(0)),
               ### parameters for optimization
               learningrate = 0.01,
               threshold = 0.01,
               stepmax = 5000
)
In the neural-network classifier the y value should be normalized.
# The response must be scaled to [0, 1] before training the neural network;
# keep the original min/max so predictions can be mapped back afterwards.
train_set.norm <- train_set
maxStr_h_texture <- max(train_set.norm$Str_h_texture)
minStr_h_texture <- min(train_set.norm$Str_h_texture)
train_set.norm$Str_h_texture <- normalize(train_set.norm$Str_h_texture)
# Single hidden unit, tanh activation, non-linear output.
nnClassifier <- neuralnet(Str_h_texture ~ ., data = train_set.norm,
                          likelihood = TRUE, act.fct = "tanh",
                          hidden = 1, linear.output = FALSE)
print(nnClassifier$result.matrix)
plot(nnClassifier)
Prediction:
# Predict on the training features and map the network's normalized output
# back to the original label scale.
output <- compute(nnClassifier, train_set[,-1])
p1 <- output$net.result
# BUG FIX: the inverse of normalize() is x * (max - min) + min; the original
# omitted "+ min", shifting every prediction down by minStr_h_texture.
p1 <- p1 * (maxStr_h_texture - minStr_h_texture) + minStr_h_texture
p1 <- round(p1, 0)
nntable <- table(train_set$Str_h_texture, p1)
XGBoost can work well with sparse matrices, but unfortunately this run could not finish within 5 hours.
# Build DMatrix objects for xgboost.
# BUG FIX: the label column (column 1, Str_h_texture) was previously included
# in the feature matrix too, leaking the target into the model.
# NOTE(review): the variable name xgb.train shadows the xgb.train() function;
# R still dispatches the call below to the function, but renaming would be safer.
xgb.train = xgb.DMatrix(data = as.matrix(train_set[,-1]), label = as.matrix(train_set$Str_h_texture))
xgb.test = xgb.DMatrix(data = as.matrix(test_set[,-1]), label = as.matrix(test_set$Str_h_texture))
validsoilTexture$Str_h_texture <- as.factor(validsoilTexture$Str_h_texture)
num_class = length(levels(validsoilTexture$Str_h_texture))
params = list(
  booster = "gbtree",
  eta = 0.001,
  max_depth = 5,
  gamma = 3,
  subsample = 0.75,
  colsample_bytree = 1,
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  # Labels are 1-based factor codes, so one extra class is declared to keep
  # every label < num_class (class 0 is simply never used).
  num_class = num_class + 1
)
# Train the XGBoost classifier.
xgb.fit = xgb.train(
  params = params,
  data = xgb.train,
  nrounds = 10000,
  nthread = 1,  # BUG FIX: the parameter name is "nthread", not "nthreads"
  early_stopping_rounds = 10,
  watchlist = list(val1 = xgb.train, val2 = xgb.test),
  verbose = 0
)
xgb.fit
Random Forest: the algorithm cannot run successfully since it gives "Error: cannot allocate vector of size 16.5 Gb". Random forests handle very sparse data poorly, as discussed in https://stats.stackexchange.com/questions/28828/is-there-a-random-forest-implementation-that-works-well-with-very-sparse-data
# Fit a random forest on the texture classes, sampling 10 candidate variables
# at each split and keeping the proximity matrix.
RfClassifier <- randomForest(Str_h_texture ~ ., data = train_set,
                             mtry = 10, proximity = TRUE)
# Cross-tabulate the forest's (OOB) predictions against the training labels.
rfTable <- table(predict(RfClassifier), train_set$Str_h_texture)
print(RfClassifier)
plot(RfClassifier)